Cargando librerias

library(viridis)
## Warning: package 'viridis' was built under R version 4.2.3
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.2.3
library(Biostrings)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
library(DECIPHER)
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.2.3
## Loading required package: parallel
library(ade4)
## Warning: package 'ade4' was built under R version 4.2.3
## 
## Attaching package: 'ade4'
## The following object is masked from 'package:Biostrings':
## 
##     score
## The following object is masked from 'package:BiocGenerics':
## 
##     score
library(seqinr)
## Warning: package 'seqinr' was built under R version 4.2.3
## 
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
## 
##     translate
library(adegenet)
## Warning: package 'adegenet' was built under R version 4.2.3
## 
##    /// adegenet 2.1.10 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
library(ape)
## Warning: package 'ape' was built under R version 4.2.3
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
## The following object is masked from 'package:Biostrings':
## 
##     complement
library(ggtree)
## ggtree v3.6.2 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## 
## Guangchuang Yu. Using ggtree to visualize data on tree-like structures.
## Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
## 
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194
## 
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
## 
##     rotate
## The following object is masked from 'package:Biostrings':
## 
##     collapse
## The following object is masked from 'package:IRanges':
## 
##     collapse
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3

Guardando las variantes en variables

# NC_045512.2 Covid 19 
# NC_004718.3 sarscov 

# OX008586 Covid beta
# OW998408 covid alfa
# OX014251 covid gamma

# MZ937000  Covid murcielago
# KP849472 Covid perro


cov2 = read.GenBank("NC_045512.2")
sarsCov = read.GenBank("NC_004718.3")
covBeta = read.GenBank("OX008586.1")
covAlpha = read.GenBank("OW998408")
covGamma = read.GenBank("OX014251")
covOmicron = read.GenBank("OW996240")
covMurcielago = read.GenBank("MZ937000")
covCanino = read.GenBank("KP849472")

Guardando todas las secuencias en un archivo fasta

secuencias <- c(cov2, sarsCov, covBeta, covAlpha, covGamma, covOmicron, covMurcielago, covCanino)

write.dna(secuencias, file = "todas_secuencias.fasta", format = "fasta")

secuencias_adn <- readDNAStringSet("todas_secuencias.fasta", format = "fasta")
## Warning in .Call2("fasta_index", filexp_list, nrec, skip, seek.first.rec, :
## reading FASTA file todas_secuencias.fasta: ignored 19917 invalid one-letter
## sequence codes
secuencias_adn
## DNAStringSet object of length 8:
##     width seq                                               names               
## [1] 29903 ATTAAAGGTTTATACCTTCCCAG...AAAAAAAAAAAAAAAAAAAAAAA NC_045512.2
## [2] 29751 ATATTAGGTTTTTACCTACCCAG...AAAAAAAAAAAAAAAAAAAAAAA NC_004718.3
## [3] 29879 NNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNN OX008586.1
## [4] 29884 NNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNN OW998408
## [5] 29890 NNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNN OX014251
## [6] 29866 NNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNN OW996240
## [7] 29838 TAACAAACCAACCAACTTTCGAT...AAAAAAAAAAAAAAAAAAAAAAA MZ937000
## [8] 30004 ACTTTTAAAGTAAAAGTGAGTGT...CCTTTTGATAGTGATACACAAAA KP849472

Grafico con las longuitudes

longitudes <- width(secuencias_adn)
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:XVector':
## 
##     slice
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
datos <- data.frame(
  x = c("Covid 19", "sarsCov", "covBeta", "covAlpha","covGamma", "covOmicron", "covMurcielago", "Alphacoronavirus_1"),
  y = longitudes)

p <- ggplot(datos, aes(x = x, y = y))+
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = seq(29751, 30020, by = 20))+
  labs(
    title = "Longuitud secuencias",
    x = "ADN",
    y = "Numero nucleotidos"
  )
ggplotly(p)

Calculando la composicion de nucleotidos:

composicion_nucleotidos <- alphabetFrequency(secuencias_adn, baseOnly = TRUE)

composicion_nucleotidos
##         A    C    G    T other
## [1,] 8954 5492 5863 9594     0
## [2,] 8481 5940 6187 9143     0
## [3,] 8678 5324 5701 9319   857
## [4,] 8890 5462 5844 9567   121
## [5,] 8790 5393 5778 9442   487
## [6,] 8548 5256 5615 9255  1192
## [7,] 8938 5492 5837 9571     0
## [8,] 9083 5114 6015 9792     0

Graficando la composicion de nucleotidos

barplot(composicion_nucleotidos, main = "ComposiciĂ³n de nucleĂ³tidos", xlab = "NucleĂ³tidos", ylab = "Frecuencia")

Calculando el porcentaje de CG de cada variante

cantidad_cg_todos = c()
porcenteajes_cg_todos = c()

nombre = c("Covid 19", "SARS-COV", "Betacoronavirus", "Alphacoronavirus","Gammacoronavirus", "Omicron", "Banal-52", "covCanino")

for (i in 1:8){ 

porcentaje = (composicion_nucleotidos[i,2] + composicion_nucleotidos[i,3]) * 100 / (composicion_nucleotidos[i,1]+composicion_nucleotidos[i,2] + composicion_nucleotidos[i,3] + composicion_nucleotidos[i,4])
cantidad_cg = composicion_nucleotidos[i,2] + composicion_nucleotidos[i,3]

cantidad_cg_todos = append(cantidad_cg_todos, cantidad_cg)

porcenteajes_cg_todos = append(porcenteajes_cg_todos, porcentaje)
cat("El porcentaje de CG del virus", nombre[i]," es:", porcentaje, "%\n")}
## El porcentaje de CG del virus Covid 19  es: 37.97278 %
## El porcentaje de CG del virus SARS-COV  es: 40.76166 %
## El porcentaje de CG del virus Betacoronavirus  es: 37.98842 %
## El porcentaje de CG del virus Alphacoronavirus  es: 37.98676 %
## El porcentaje de CG del virus Gammacoronavirus  es: 37.99272 %
## El porcentaje de CG del virus Omicron  es: 37.91239 %
## El porcentaje de CG del virus Banal-52  es: 37.96836 %
## El porcentaje de CG del virus covCanino  es: 37.09172 %

Generando una grafica que grafique el porcentaje de CG de cada virus

datos <- data.frame(
  x = c("Covid 19", "sarsCov", "covBeta", "covAlpha","covGamma", "covOmicron", "covMurcielago", "covCanino"),
  y = porcenteajes_cg_todos)

p <- ggplot(datos, aes(x = x, y = y))+
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = seq(0,45, by = 2))+
  labs(
    title = "Porcentaje CG por virus",
    x = "Virus",
    y = "Porcentaje"
  )
ggplotly(p)

Cerando dataFrame con datos de los genomas

nombres = c("Covid 19", "sarsCov", "covBeta", "covAlpha","covGamma", "covOmicron", "covMurcielago", "Alphacoronavirus_1")
filas = c("ID", "CANTIDAD CG", "PORCENTAJE CG")
id = c("NC_045512.2","NC_004718.3","OX008586.1","OW998408","OX014251","OW996240","MZ937000","KP849472")

df <- data.frame(matrix(nrow = 3, ncol = 8))
colnames(df) <- nombres
rownames(df) <- filas

df[1,] <- id
df[2,] <- cantidad_cg_todos
df[3,] <- porcenteajes_cg_todos
# Print the data frame
cat("El marco de datos resultante es:\n")
## El marco de datos resultante es:
print(df)
##                       Covid 19          sarsCov          covBeta
## ID                 NC_045512.2      NC_004718.3       OX008586.1
## CANTIDAD CG              11355            12127            11025
## PORCENTAJE CG 37.9727786509715 40.7616550704178 37.9884225759768
##                       covAlpha         covGamma       covOmicron
## ID                    OW998408         OX014251         OW996240
## CANTIDAD CG              11306            11171            10871
## PORCENTAJE CG 37.9867620871552 37.9927218311057 37.9123945037316
##                  covMurcielago Alphacoronavirus_1
## ID                    MZ937000           KP849472
## CANTIDAD CG              11329              11129
## PORCENTAJE CG 37.9683624907836   37.0917211038528

Alineando las secuencias

virus_seq_align <- AlignSeqs(secuencias_adn)
## Determining distance matrix based on shared 11-mers:
## ================================================================================
## 
## Time difference of 0.67 secs
## 
## Clustering into groups by similarity:
## ================================================================================
## 
## Time difference of 0.01 secs
## 
## Aligning Sequences:
## ================================================================================
## 
## Time difference of 34.72 secs
## 
## Iteration 1 of 2:
## 
## Determining distance matrix based on alignment:
## ================================================================================
## 
## Time difference of 0.01 secs
## 
## Reclustering into groups by similarity:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Realigning Sequences:
## ================================================================================
## 
## Time difference of 31.68 secs
## 
## Iteration 2 of 2:
## 
## Determining distance matrix based on alignment:
## ================================================================================
## 
## Time difference of 0.01 secs
## 
## Reclustering into groups by similarity:
## ================================================================================
## 
## Time difference of 0 secs
## 
## Realigning Sequences:
## ================================================================================
## 
## Time difference of 35.41 secs
BrowseSeqs(virus_seq_align)

Creando la matriz distancia

writeXStringSet(virus_seq_align, file="coronavirus_seq_align.fasta")
virus_aligned = read.alignment("coronavirus_seq_align.fasta", format = "fasta")

matriz_distancia = dist.alignment(virus_aligned, matrix = "similarity")

matriz_distancia = as.data.frame(as.matrix(matriz_distancia))
matriz_distancia

Creando una tabla a escala de grices d ela matriz distancia

table.paint(matriz_distancia, cleg = 0, clabel.row = 0.5, clabel.col = 0.5) + scale_color_viridis()

## NULL

Creando un arbol filogenetico

writeXStringSet(virus_seq_align, file="coronavirus_seq_align.fasta")
virus_aligned = read.alignment("coronavirus_seq_align.fasta", format = "fasta")
matriz_distancia = dist.alignment(virus_aligned, matrix = "similarity")

virus_tree <- nj(matriz_distancia)
virus_tree <- ladderize(virus_tree)
plot_virus_filogenia <- ggtree(virus_tree) + geom_tiplab() + ggtitle("Phylogenetic analysis of SARS-CoV genomes")
plot_virus_filogenia